df <- as.data.frame(read_dta("./Cleaned_nuMoM2B_dataset_draft_8.31.2021.dta"))
a<-df %>% dplyr::select(numomid,pree_acog,pct_emptyc,d_totdens,p_totdens,f_totdens,p_seaplantdens,sodium_dens,g_nwhldens,g_whldens,v_totdens,f_soliddens,v_beangrendens,fatratio,heix10_sodium,momeduc,married,insurance,momrace4,prediab,prehtn,gravcat,v1_pregplanned,smokerpre,momage,bmiprepreg,dt_kcal)
a$pree_acog<-as.factor(a$pree_acog)
a$momeduc<-as.factor(a$momeduc)
a$married<-as.factor(a$married)
a$insurance<-as.factor(a$insurance)
a$momrace4<-as.factor(a$momrace4)
a$prediab<-as.factor(a$prediab)
a$prehtn<-as.factor(a$prehtn)
a$v1_pregplanned<-as.factor(a$v1_pregplanned)
a$smokerpre<-as.factor(a$smokerpre)
a$gravcat<-as.factor(a$gravcat)
#str(a)
#a
aggr(a)
Mode <- function (x, na.rm) {
xtab <- table(x)
xmode <- names(which(xtab == max(xtab)))
if (length(xmode) > 1) xmode <- ">1 mode"
return(xmode)
}
for (var in 1:ncol(a)) {
if (class(a[,var])=="numeric") {
a[is.na(a[,var]),var] <- mean(a[,var], na.rm = TRUE)
} else if (class(a[,var]) %in% c("character", "factor")) {
a[is.na(a[,var]),var] <- Mode(a[,var], na.rm = TRUE)
}
}
aggr(a)
summary(a)
## numomid pree_acog pct_emptyc d_totdens
## Length:10038 0:9204 Min. : 8.213 Min. :0.04607
## Class :character 1: 834 1st Qu.: 26.993 1st Qu.:0.60555
## Mode :character Median : 31.833 Median :0.87474
## Mean : 31.833 Mean :0.87474
## 3rd Qu.: 35.143 3rd Qu.:0.99575
## Max. :136.413 Max. :4.61078
## p_totdens f_totdens p_seaplantdens sodium_dens
## Min. :0.1362 Min. :0.000382 Min. :0.003639 Min. :0.5943
## 1st Qu.:2.0155 1st Qu.:0.493894 1st Qu.:0.465902 1st Qu.:1.4859
## Median :2.4042 Median :0.831714 Median :0.801858 Median :1.6116
## Mean :2.4042 Mean :0.831714 Mean :0.801858 Mean :1.6116
## 3rd Qu.:2.6193 3rd Qu.:1.007284 3rd Qu.:0.959537 3rd Qu.:1.7375
## Max. :9.4185 Max. :5.613823 Max. :4.786033 Max. :3.4519
## g_nwhldens g_whldens v_totdens f_soliddens
## Min. :0.1426 Min. :0.0000 Min. :0.006568 Min. :0.000382
## 1st Qu.:1.8369 1st Qu.:0.3360 1st Qu.:0.630542 1st Qu.:0.265075
## Median :2.2344 Median :0.6309 Median :0.954826 Median :0.574514
## Mean :2.2344 Mean :0.6309 Mean :0.954826 Mean :0.574514
## 3rd Qu.:2.5424 3rd Qu.:0.7809 3rd Qu.:1.090429 3rd Qu.:0.718450
## Max. :9.8700 Max. :3.6824 Max. :5.454516 Max. :5.008740
## v_beangrendens fatratio heix10_sodium momeduc married insurance
## Min. :0.0000 Min. :0.796 Min. : 0.000 0: 818 0:3911 1:6821
## 1st Qu.:0.1031 1st Qu.:1.703 1st Qu.: 2.916 1:1178 1:6127 2:3036
## Median :0.2361 Median :1.947 Median : 4.370 2:2957 3: 181
## Mean :0.2534 Mean :1.947 Mean : 4.370 3:5085
## 3rd Qu.:0.2959 3rd Qu.:2.086 3rd Qu.: 5.712
## Max. :2.9187 Max. :4.580 Max. :10.000
## momrace4 prediab prehtn gravcat v1_pregplanned smokerpre momage
## 1:5952 0:9891 0:9745 1:7448 1:5877 0:8256 Min. :13.00
## 2:1424 1: 147 1: 293 2:1916 2:4161 1:1782 1st Qu.:22.00
## 3:1747 3: 674 Median :27.00
## 4: 915 Mean :26.94
## 3rd Qu.:31.00
## Max. :45.00
## bmiprepreg dt_kcal
## Min. :11.54 Min. : 25.12
## 1st Qu.:21.33 1st Qu.: 1254.67
## Median :23.94 Median : 1691.30
## Mean :25.62 Mean : 1720.19
## 3rd Qu.:28.06 3rd Qu.: 1866.36
## Max. :72.49 Max. :17054.34
We can use ggpairs to explore scatter plot sets:
## you may need to run this outside the md environment, bc the figure created is very large.
a %>% {
bind_cols(
select_if(., is.numeric),
select_at(., "momrace4")
)
} %>%
GGally::ggpairs(.,
columns=1:16,
mapping = ggplot2::aes(colour=momrace4),
lower = list(continuous = wrap("smooth", alpha = 0.3, size=0.1, se=F)),
upper = list(continuous = wrap(ggally_cor, stars=FALSE)))
ggsave("./NuMOM_pairwise-2021_10_19.tiff",width=600,height=600,units="mm",limitsize=F,dpi=300, compression = "lzw")
a%>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram(bins=15)
## Warning: attributes are not identical across measure variables;
## they will be dropped
a%>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_density()
## Warning: attributes are not identical across measure variables;
## they will be dropped
a%>%
keep(is.factor) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_bar()
## Warning: attributes are not identical across measure variables;
## they will be dropped
b<- a%>% dplyr::select(bmiprepreg,pct_emptyc,d_totdens,f_soliddens,heix10_sodium,fatratio,dt_kcal,momrace4,momeduc,gravcat,smokerpre)
b%>%gather(-bmiprepreg,-momrace4,-momeduc,-gravcat,-smokerpre,key="var",value="value")%>% ggplot(aes(x = bmiprepreg, y = value,color=momrace4)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('BMI and dietary varables scatter plots grouped by mom race')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
b%>%gather(-bmiprepreg,-momrace4,-momeduc,-gravcat,-smokerpre,key="var",value="value")%>% ggplot(aes(x = bmiprepreg, y = value,color=momeduc)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('BMI and dietary varables scatter plots grouped by mom education')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
b%>%gather(-bmiprepreg,-momrace4,-momeduc,-gravcat,-smokerpre,key="var",value="value")%>% ggplot(aes(x = bmiprepreg, y = value,color=gravcat)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('BMI and dietary varables scatter plots grouped by mom gravidity')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
b%>%gather(-bmiprepreg,-momrace4,-momeduc,-gravcat,-smokerpre,,key="var",value="value")%>% ggplot(aes(x = bmiprepreg, y = value,color=smokerpre)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('BMI and dietary varables scatter plots grouped by mom smoking status')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
c<- a%>% dplyr::select(momage,pct_emptyc,d_totdens,f_soliddens,heix10_sodium,fatratio,dt_kcal,momrace4,momeduc,gravcat,smokerpre)
c%>%gather(-momage,-momrace4,-momeduc,-gravcat,-smokerpre,key="var",value="value")%>% ggplot(aes(x = momage, y = value,color=momrace4)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('Age and dietary varables scatter plots grouped by mom race')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
c%>%gather(-momage,-momrace4,-momeduc,-gravcat,-smokerpre,key="var",value="value")%>% ggplot(aes(x = momage, y = value,color=momeduc)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('Age and dietary varables scatter plots grouped by mom education')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
c%>%gather(-momage,-momrace4,-momeduc,-gravcat,-smokerpre,,key="var",value="value")%>% ggplot(aes(x = momage, y = value,color=gravcat)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('Age and dietary varables scatter plots grouped by gravidity')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
c%>%gather(-momage,-momrace4,-momeduc,-gravcat,-smokerpre,key="var",value="value")%>% ggplot(aes(x = momage, y = value,color=smokerpre)) + geom_point(alpha=0.15) + geom_smooth() + facet_wrap(~ var,scales="free")+theme_bw()+ggtitle('Age and dietary varables scatter plots grouped by mom smoking status')
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
scale_a<-a %>%select(-numomid)%>% mutate_if(is.numeric, scale)
fviz_nbclust(scale_a,kmeans,method='wss')
fviz_nbclust(scale_a,kmeans,method='silhouette')
k2<-kmeans(scale_a,centers=2,nstart=50)
#fviz_cluster(km, geom = "point", data = scale_a)
scale_a$clusters_k2<-k2$cluster
cluster_model_k2<-vglm(factor(clusters_k2) ~.,data=scale_a,family='multinomial')
summary(cluster_model_k2)
##
## Call:
## vglm(formula = factor(clusters_k2) ~ ., family = "multinomial",
## data = scale_a)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.33372 0.38117 -3.499 0.000467 ***
## pree_acog1 0.04783 0.16969 0.282 0.778025
## pct_emptyc -1.21173 0.11796 -10.273 < 2e-16 ***
## d_totdens 0.02403 0.06883 0.349 0.726971
## p_totdens 0.51362 0.09708 5.291 1.22e-07 ***
## f_totdens 0.83268 0.10885 7.650 2.01e-14 ***
## p_seaplantdens 0.93166 0.07861 11.852 < 2e-16 ***
## sodium_dens 0.73679 0.38583 1.910 0.056186 .
## g_nwhldens -0.48298 0.08077 -5.980 2.23e-09 ***
## g_whldens 0.78123 0.06359 12.285 < 2e-16 ***
## v_totdens 1.17293 0.17832 6.578 4.78e-11 ***
## f_soliddens 0.96427 0.11336 8.506 < 2e-16 ***
## v_beangrendens 0.93544 0.14502 6.450 1.12e-10 ***
## fatratio 0.88192 0.08398 10.502 < 2e-16 ***
## heix10_sodium -0.56732 0.36505 -1.554 0.120161
## momeduc1 0.18604 0.40142 0.463 0.643041
## momeduc2 1.51470 0.35996 4.208 2.58e-05 ***
## momeduc3 3.09217 0.37137 8.326 < 2e-16 ***
## married1 0.77749 0.13507 5.756 8.61e-09 ***
## insurance2 -0.96936 0.14216 -6.819 9.18e-12 ***
## insurance3 -1.68268 0.36004 -4.674 2.96e-06 ***
## momrace42 -0.68812 0.17700 -3.888 0.000101 ***
## momrace43 -1.13737 0.14217 -8.000 1.24e-15 ***
## momrace44 -1.53370 0.17460 -8.784 < 2e-16 ***
## prediab1 -0.30697 0.41426 -0.741 0.458692
## prehtn1 -0.13190 0.28396 -0.465 0.642283
## gravcat2 -0.04823 0.12231 -0.394 0.693335
## gravcat3 -0.13971 0.19875 -0.703 0.482077
## v1_pregplanned2 -0.63513 0.11401 -5.571 2.53e-08 ***
## smokerpre1 -0.28149 0.14222 -1.979 0.047791 *
## momage 1.50382 0.07320 20.544 < 2e-16 ***
## bmiprepreg -0.31963 0.05108 -6.258 3.90e-10 ***
## dt_kcal -0.46464 0.07814 -5.947 2.74e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Name of linear predictor: log(mu[,1]/mu[,2])
##
## Residual deviance: 1523.362 on 10005 degrees of freedom
##
## Log-likelihood: -761.6808 on 10005 degrees of freedom
##
## Number of Fisher scoring iterations: 15
##
## No Hauck-Donner effect found in any of the estimates
##
##
## Reference group is level 2 of the response
g1<-scale_a%>% ggplot(aes(y=clusters_k2))+geom_bar(aes(fill=momrace4))+ggtitle("Cluster groups by mom race")
g2<-scale_a%>% ggplot(aes(y=clusters_k2))+geom_bar(aes(fill=momeduc))+ggtitle("Cluster groups by mom education")
g3<-scale_a%>% ggplot(aes(y=clusters_k2))+geom_bar(aes(fill=smokerpre))+ggtitle("Cluster groups by smoking status")
g4<-scale_a%>% ggplot(aes(y=clusters_k2))+geom_bar(aes(fill=gravcat))+ggtitle("Cluster groups by gravidity")
g5<-scale_a%>% ggplot(aes(y=clusters_k2))+geom_bar(aes(fill=married))+ggtitle("Cluster groups by marriage status")
g6<-scale_a%>% ggplot(aes(y=clusters_k2))+geom_bar(aes(fill=insurance))+ggtitle("Cluster groups by insurance status")
g7<-scale_a%>% ggplot(aes(y=clusters_k2))+geom_bar(aes(fill=pree_acog))+ggtitle("Cluster groups by preeclampsia")
grid.arrange(g1,g2,g3,g4,g5,g6,g7,ncol=2,nrow=4)